/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice, 
 *     this list of conditions and the following disclaimer. 
 *  2. Redistributions in binary form must reproduce the above copyright 
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution. 
 *  3. The name of the author may not be used to endorse or promote products 
 *     derived from this software without specific prior written
 *     permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/*
 * File operations
 *
 * Contents:
 *   gpfs_f_llseek
 *   gpfs_f_readdir
 *   gpfs_f_poll
 *   gpfs_f_ioctl
 *   gpfs_file_mmap
 *   gpfs_filemap_nopage
 *   gpfs_filemap_open
 *   gpfs_filemap_close
 *   gpfs_f_mmap
 *   gpfs_f_open
 *   gpfs_f_release
 *   gpfs_f_fsync
 *   gpfs_f_fasync
 *   fsyncInternal
 *   gpfs_f_check_media_change
 *   gpfs_f_revalidate
 *   gpfs_f_lock
 *   rdwrInternal
 *   gpfs_f_read
 *   gpfs_f_dir_read
 *   gpfs_f_write
 *   gpfs_f_readv
 *   gpfs_f_writev
 *   gpfs_f_cleanup
 *
 * $Id: file.c,v 1.30.2.2 2002/08/01 15:22:36 gjertsen Exp $
 *
 * $Log: file.c,v $
 * Revision 1.30.2.2  2002/08/01 15:22:36  gjertsen
 * Don't allow mmap writes at 2.4.18-3 or higher unless the mmap patch
 * is installed (to avoid inadvertent kernel panic).
 *
 * Revision 1.30.2.1  2001/11/21 07:52:52  mcnabb
 * Defect 353917:
 * vm_ops->close() always be done after calling vm_ops->open() even if
 * returns error. Because of this Linux semantics we need to increment
 * mmap counters even if an error occurs.
 *
 * Revision 1.30  2001/10/09 17:45:29  dcraft
 * Fixes for running on 2.4.9-ac kernel series. (behind ifdefs)
 *
 * Revision 1.29  2001/10/03 14:46:12  dcraft
 * First attempt to bring us up to 2.4.9 and 2.4.10
 *
 * Revision 1.28  2001/09/28 20:46:52  wyllie
 * Include more operations in vfsstats counters
 *
 * Revision 1.27  2001/09/25 14:27:50  dcraft
 * Comment change.  No code change.
 *
 * Revision 1.26  2001/09/24 19:37:48  jpalmer
 * Fix NAS defect 6663.  Replace unused l_vfs lock field in 
 * LINUX version with l_caller to distinguish LOCKD requested locks.
 *
 * Revision 1.25  2001/09/24 13:15:15  radhak
 * Got rid of dead code.
 *
 * Revision 1.24  2001/09/04 15:51:20  eshel
 * Performance improvement to unlock the kernel lock on calls from NFSD.
 *
 * Revision 1.23  2001/08/29 00:20:34  jpalmer
 * Change content of OpenFile advLkObjP from a file pointer to an 
 * inode pointer.  The file pointer cannot be guaranteed to still be 
 * valid and this was causing failures in revoke when an out-of-date 
 * file pointer was used.
 *
 * Revision 1.22  2001/08/06 23:36:37  wyllie
 * Do not hold kernel lock while doing readdir
 *
 * Revision 1.21  2001/08/04 00:42:26  tee
 * Remove LINUX_MMAP ifdefs
 *
 * Revision 1.20  2001/07/10 12:06:27  jpalmer
 * Add function to allow SMB Open and FCNTL tokens to be moved from one system to
 * another in response to the NAS load balancer moving the users.  Add the
 * external interface to provide a lock clamping function that will block new
 * locks during the time NFS is recovering its FCNTL and File locks.
 *
 * Revision 1.19  2001/06/21 00:16:24  tee
 * Make readv and writev system calls with more than one iovec element work.
 * Only the first element was being passed to rdwrInternal, so GPFS would pick
 * up garbage for address and count values of the remaining elements causing
 * kernel panics or other problems.
 *
 * Revision 1.18  2001/06/14 18:14:07  gjertsen
 * Initial changes for IA64 beta RH 7.1 with 2.4.3-3 kernel. Get GPFS_PRINTF
 * working again.
 *
 * Revision 1.17  2001/06/07 01:38:59  manoj
 * On 64-bit flock calls, Linux converts flock64 structure to 32-bit flock
 * before calling GPFS lock routine, but leaves the commands (F_GETLK64, etc.)
 * as is. Allow these to go through.
 *
 * Revision 1.16  2001/05/25 14:48:19  gjertsen
 * Minor fixes to get IA64 code to compile again.
 *
 * Revision 1.15  2001/05/18 19:23:56  radhak
 * Defect 339937:
 * Enabled single node mmap semantics and fixed deadlock found by Tom
 *
 * Revision 1.14  2001/05/09 00:43:33  wsawdon
 * Fixed bugs in linux readdir returning internal inode number
 * instead of external snapshot/inode numbers.
 *
 * Revision 1.13  2001/05/08 13:40:34  dixonbp
 * kxRegisterCleanup for linux and gpfs_f_cleanup/gpfsCleanup
 * to do the equivalent of what fopClose does on aix.
 *
 * Revision 1.12  2001/05/04 23:30:12  schmuck
 * Move "extern struct xxx_operations ..." declarations into .h file
 * instead of replicating them in various .c files.
 * Replace empty gpfs_dops_valid table with a NULL pointer.
 *
 * Revision 1.11  2001/04/29 21:52:31  schmuck
 * SMB_LOCKS code cleanup:
 * In-line openlock state updates on fast open/close path.
 * Remember whether an open (share-mode) lock was obtained;
 * if not, skip cleanup code in close.
 * Removed unused 'isSamba' flag in vinfo.
 *
 * Revision 1.10  2001/04/23 23:08:30  dcraft
 * Fix disable_lock so it actually does what it says it does.
 * Perform FEXCL check before access check on create race condition.
 *
 * Revision 1.9  2001/04/23 18:11:25  eshel
 * Rename createThreadId to createRaceLoserThreadId and fix logic error.
 *
 * Revision 1.8  2001/04/20 23:03:07  eshel
 * If a regular file is created by the sys_open call (for now we can not tell if
 * the call is from sys_mknod or sys_open) and the file is found, return rc 0,
 * remember the thread that called create. Later on the open call the open flags
 * are available and if it is the same thread, and FEXCL was on fail it with
 * EEXIST, also check permission since linux assumes that this process created
 * the file and did not do any permission check.
 *
 * Revision 1.7  2001/04/19 20:49:40  wyllie
 * Add cast to get rid of compiler warning
 *
 * Revision 1.6  2001/04/16 21:07:20  eshel
 * remove call to cxiInvalidateAttr(), OS node attributes are now updated
 *
 * Revision 1.5  2001/04/14 00:29:02  schmuck
 * gpfs_f_open: include file name from the dentry in the trace.
 *
 * Revision 1.4  2001/04/08 22:18:28  dcraft
 * Fix multinode delete race conditions.  Still incomplete.
 *
 * Revision 1.3  2001/04/05 13:31:06  gjertsen
 * Continue C++ to C conversion with manual C++2C utility.
 * Changes primarily for vfs stat stuff.
 *
 * Revision 1.79  2001/04/02 14:33:09  dixonbp
 * Convert mmap.C to mmap.c
 *
 * Revision 1.78  2001/03/30 17:35:23  eshel
 * don't update atime on gpfs_iread (read by inode) call
 *
 * Revision 1.77  2001/03/26 18:29:54  dcraft
 * Update inode attributes in OS node layer via callback to cxiSetOSNode
 * (previous setInode).  The attributes are now updated during kSFSGetattr()
 * while the lock is held to ensure validity.
 *
 * Revision 1.76  2001/03/21 15:56:47  wyllie
 * Trace file offsets and lengths in hex
 *
 * Revision 1.75  2001/03/15 16:52:41  eshel
 * Minor tweak
 *
 * Revision 1.74  2001/03/13 22:15:41  eshel
 * change interface to gpfsWrite()
 *
 * Revision 1.73  2001/03/13 21:42:18  radhak
 * Defect 332458,333008:
 * need to register vm_ops->open function for incrementing mmap reference
 * count for every child process forked after mmap.
 *
 * Revision 1.72  2001/03/08 16:15:22  jpalmer
 * SMB Locking - Detect smbd call, set isSamba in MMFSVinfo
 *
 * Revision 1.71  2001/03/07 20:05:22  jpalmer
 * SMB Open Lock function
 *
 * Revision 1.70  2001/03/05 23:28:09  dcraft
 * Modify inode and gpfsNode reference management.  Inode is now acquired
 * during gpfsNode creation and must be released via cxiPutOSNode().
 * (documented in gpfs database).  Add "mmfsadm dump vnodes" for producing
 * trace info on all held inodes.
 *
 * Revision 1.69  2001/03/01 20:44:20  radhak
 * Need serialization between nopage and mmap flush.
 * Also, always get page table lock while holding page lock.
 *
 * Revision 1.68  2001/01/27 15:42:20  dixonbp
 * NFS fixes to cxiCloseNFS and gpfs_f_lock.  Remove incorrect nfs handling
 * in gpfs_i_validate, and start to handle a nfs problem with gpfs_i_lookup.
 *
 * Revision 1.67  2001/01/25 20:33:22  radhak
 * Removed debug message.
 *
 * Revision 1.66  2001/01/19 20:55:11  radhak
 * LINUX_MMAP: linux 2.4.0 kernel rework.
 * For time being disabled single node mmap semantics because of lack of
 * interface to invalidate memory mapped pages
 *
 * Revision 1.65  2001/01/15 14:52:43  dixonbp
 * Fix checking for nfsd close in gpfs_f_release.
 *
 * Revision 1.64  2001/01/09 15:27:06  dixonbp
 * Remove an old ifdef
 *
 * Revision 1.63  2000/12/29 22:22:30  radhak
 * Defect 322452: Before calling gpfs_filemap_sync get lock.
 * Also added some traces.
 *
 * Revision 1.62  2000/12/19 21:11:56  wyllie
 * Remove assertions and traces about the state of the Linux BKL.  Linux does
 * not keep track of who owns the lock, so these asserts were blowing up on
 * an SMP if the kernel lock happened to be held by the other processor.
 *
 * Revision 1.61  2000/12/18 13:53:15  gjertsen
 * More cleanup of comments/documentation.
 *
 * Revision 1.60  2000/12/15 20:19:52  dcraft
 * Fix build break with mmapLock
 *
 * Revision 1.59  2000/12/15 13:56:39  gjertsen
 * Clean up documentation.
 *
 */

#include <Shark-gpl.h>

#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#ifdef MODULE
#include <linux/module.h>
#endif
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/mm.h>
#include <linux/mman.h>

#include <cxiTypes.h>
#include <cxiSystem.h>
#include <cxiMode.h>
#include <cxi2gpfs.h>
#include <cxiVFSStats.h>
#include <cxiCred.h>

#include <linux2gpfs.h>
#include <Trace.h>

/* generic vm_area_ops exported for stackable file systems */
/* The two declarations below are compiled out (#if 0). */
#if 0
extern int filemap_swapout(struct page * page, struct file *file);
extern int filemap_sync(struct vm_area_struct * vma, unsigned long address,
                        size_t size, unsigned int flags);
#endif
/* Page-fault handler that gpfs_filemap_nopage() delegates to. */
extern struct page *filemap_nopage(struct vm_area_struct * area,
                                   unsigned long address, int no_share);
/* Lock/unlock pair that brackets the flush_vma() call in gpfs_filemap_close();
 * buf carries the caller's name as a string. */
extern void mmapFlushLock(cxiNode_t *cnP,char *buf);
extern void mmapFlushUnLock(cxiNode_t *cnP,char *buf);
/* Convenience wrapper around __find_lock_page(); not referenced by the
 * functions defined in this file. */
#define find_inode_lock_page(mapping, index) \
                __find_lock_page(mapping, index, page_hash(mapping, index))
/* Flushes the [address, end) range of a mapping; called from
 * gpfs_filemap_close() with MmfInvalidate over the whole vma. */
extern void
flush_vma(cxiNode_t *gnP, struct vm_area_struct *vma,
		  unsigned long address,unsigned long end,
          enum MmflushOption mmfopt);

/* prototypes */
/* Shared worker for gpfs_f_fsync() and gpfs_f_fasync(). */
int fsyncInternal(struct file *fP);
/* Shared worker for the read/write/readv/writev entry points.  The definition
 * further down omits the storage class and therefore takes its (internal)
 * linkage from this declaration. */
static inline int rdwrInternal(struct file *, cxiRdWr_t, const struct cxiIovec_t *,
                               unsigned long, loff_t *);

/* file_operations */

/*
 * gpfs_f_llseek - llseek file operation: reposition the offset of an open file.
 *
 * Parameters:
 *   fP     - open file whose f_pos is to be changed
 *   offset - requested displacement
 *   origin - 1: relative to the current position; 2: relative to the end of
 *            the file (inode attributes are refreshed first so i_size is
 *            current); any other value treats offset as an absolute position
 *
 * Returns the new absolute offset, or -EINVAL if the resulting offset would
 * be negative.  On failure f_pos is left unchanged.  Without this check a
 * negative position would be stored in f_pos and handed back to the caller,
 * where it is indistinguishable from an error code.
 */
loff_t
gpfs_f_llseek(struct file *fP, loff_t offset, int origin)
{
  struct inode *iP = fP->f_dentry->d_inode;
  loff_t result;

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LLSEEK_ENTER,
         "gpfs_f_llseek enter: fP 0x%lX offset 0x%llX origin %d\n",
         fP, offset, origin);
  /* BKL is held at entry */

  switch (origin)
  {
    case 2:
      /* Seek relative to end of file: refresh the cached size first. */
      gpfs_i_getattr_internal(iP);
      offset += iP->i_size;
      break;

    case 1:
      /* Seek relative to the current position. */
      offset += fP->f_pos;
      break;

    default:
      /* Absolute seek: offset is used as given. */
      break;
  }

  if (offset < 0)
  {
    /* Reject positions before the start of the file. */
    result = -EINVAL;
  }
  else
  {
    if (offset != fP->f_pos)
    {
      fP->f_pos = offset;
      fP->f_reada = 0;
      /* f_version is deliberately not bumped here (the update was already
       * disabled in earlier revisions of this function). */
    }
    result = offset;
  }

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_LLSEEK_EXIT,
         "gpfs_f_llseek exit: fP 0x%lX offset 0x%llX origin %d\n",
         fP, offset, origin);
  return result;
}

/*
 * gpfs_f_readdir - readdir file operation.
 *
 * Hands the caller's filldir callback and its argument to
 * gpfs_ops.gpfsReaddir() through a cxiFillDirArg_t, unless the file position
 * already equals GPFS_DIR_EOF, in which case nothing is read.
 *
 * The big kernel lock is held on entry; it is dropped around the GPFS call
 * and re-taken before returning.
 *
 * Returns the negated return code of gpfsReaddir (0 at end of directory).
 */
int
gpfs_f_readdir(struct file *fP, void *direntP, filldir_t filldir)
{
  int rc = 0;                       /* stays 0 on the end-of-directory path */
  struct inode *dirInodeP;
  cxiNode_t *nodeP;
  struct gpfsVfsData_t *vfsDataP;
  cxiFillDirArg_t fillArg;

  VFS_STAT_START(readdirCall);
  DBGASSERT(fP != NULL);

  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_READDIR_ENTER,
         "gpfs_f_readdir enter: fP 0x%lX direntP 0x%lX "
         "filldir 0x%lX pos %lld\n", fP, direntP, filldir, fP->f_pos);
  /* BKL is held at entry */

  /* Only call into GPFS when we are not already positioned at EOF. */
  if (fP->f_pos != GPFS_DIR_EOF)
  {
    dirInodeP = fP->f_dentry->d_inode;
    DBGASSERT(dirInodeP != NULL);
    nodeP = VP_TO_CNP(dirInodeP);
    vfsDataP = VP_TO_PVP(dirInodeP);
    DBGASSERT(vfsDataP != NULL);

    /* The OS filldir routine has different signatures at different kernel
     * levels, so it is not called directly.  It is wrapped together with
     * its argument and invoked via cxiFillDir() in the portability layer.
     */
    fillArg.argP = direntP;
    fillArg.fnP = (void *)filldir;

    unlock_kernel();
    rc = gpfs_ops.gpfsReaddir(vfsDataP, nodeP, &fillArg, cxiFillDir,
                              &fP->f_pos, vnOp);
    lock_kernel();
  }

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_READDIR_EXIT,
         "gpfs_f_readdir exit: fP 0x%lX pos %lld code 0 rc %d\n",
         fP, fP->f_pos, rc);

  VFS_STAT_STOP;
  return (-rc);
}

uint
gpfs_f_poll(struct file *fP, struct poll_table_struct *wait)
{
  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_POLL,
         "gpfs_f_poll enter & exit rc -ENOSYS: fP 0x%lX\n", fP);
  return (uint)-ENOSYS;
}

/*
 * gpfs_f_ioctl - ioctl file operation; not implemented.
 *
 * Traces the inode, file and command and always returns -ENOSYS; arg is
 * not examined.
 */
int
gpfs_f_ioctl(struct inode *iP, struct file *fP, uint cmd, unsigned long arg)
{
  const int notSupported = -ENOSYS;

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_IOCTL,
         "gpfs_f_ioctl enter & exit rc -ENOSYS: iP 0x%lX fP 0x%lX cmd %d\n",
         iP, fP, cmd);

  return notSupported;
}


/*
 * gpfs_file_mmap - general mmap of a disk file.
 *
 * Validates that the inode can back a mapping and, on success, installs
 * gpfs_vmop as the vma's operations table.  Checks are made in this order:
 *   code 1: shared + may-write mapping but no writepage address-space op
 *           -> -EINVAL
 *   code 2: no superblock or not a regular file -> -EACCES
 *   code 3: no readpage address-space op -> -ENOEXEC
 * After that the inode attributes are revalidated through
 * gpfs_ops.gpfsGetattr(); a non-zero result from it is returned unchanged.
 *
 * Returns 0 on success.
 * NOTE(review): the gpfsGetattr failure code is passed through without
 * negation, unlike the explicit -Exxx values above - confirm the sign
 * convention expected by the caller.
 */
int gpfs_file_mmap(struct file * file, struct vm_area_struct * vma)
{
  struct inode *inode = file->f_dentry->d_inode;
  cxiNode_t *cnP = VP_TO_CNP(inode);
  struct gpfsVfsData_t *privVfsP = VP_TO_PVP(inode);
  cxiVattr_t attrs;
  int sharedWritable;
  int code = 0;
  int rc = 0;

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_MAP_ENTER,
         "gpfs_filemap_map enter fP 0x%lX inum %d vma 0x%1X\n",
          file,inode->i_ino,vma);

  sharedWritable = (vma->vm_flags & VM_SHARED) &&
                   (vma->vm_flags & VM_MAYWRITE);

  if (sharedWritable && !inode->i_mapping->a_ops->writepage)
  {
    code = 1;
    rc = -EINVAL;
  }
  else if (!inode->i_sb || !S_ISREG(inode->i_mode))
  {
    code = 2;
    rc = -EACCES;
  }
  else if (!inode->i_mapping->a_ops->readpage)
  {
    code = 3;
    rc = -ENOEXEC;
  }
  else
  {
    /* Revalidate the linux inode.  This has the effect of calling us back
     * under a lock and setting the inode attributes at the OS level (since
     * this operating system caches this info in the vfs layer).
     */
    rc = gpfs_ops.gpfsGetattr(privVfsP, cnP, &attrs, false);
    if (rc == 0)
    {
      UPDATE_ATIME(inode);
      vma->vm_ops = &gpfs_vmop;
    }
  }

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_MAP_EXIT,
        "gpfs_file_mmap exit: rc %d code %d\n",
         rc,code);
  return rc;
}

/*
 * gpfs_filemap_nopage - page-fault handler for GPFS mappings.
 *
 * Thin tracing wrapper: the fault is resolved entirely by the generic
 * filemap_nopage(); this routine only traces the request and the resulting
 * page (or the fact that no page was returned).
 *
 * Returns whatever filemap_nopage() returned, including NULL.
 */
struct page * gpfs_filemap_nopage(struct vm_area_struct * area,
                             unsigned long address, int no_share)
{
  struct file *fP = area->vm_file;
  struct inode *inode = fP->f_dentry->d_inode;
  cxiNode_t *cnP = VP_TO_CNP(inode);       /* not used below */
  char buf[] = "gpfs_filemap_nopage()";    /* not used below */
  struct page *faultedP;

  TRACE3(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE,
         "gpfs_filemap_nopage:area 0x%lX, address 0x%lX, no_share %d \n",
         area,address,no_share);
  TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE1,
         "gpfs_filemap_nopage:i_no %d name %s\n",
         inode->i_ino,fP->f_dentry? fP->f_dentry->d_name.name: (const unsigned char*)"");

  faultedP = filemap_nopage(area,address,no_share);

  if (faultedP != NULL)
  {
    TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_PAGE,
           "gpfs_filemap_nopage: pageP 0x%lX pageIndex %d pageCount %d "
           "pageFlags 0x%lX\n",
            faultedP,faultedP->index,page_count(faultedP),faultedP->flags);
  }
  else
  {
    TRACE2(TRACE_VNODE, 2, TRCID_LINUXOPS_NOPAGE_NULL,
           "gpfs_filemap_nopage:area 0x%lX, address 0x%lX pageP = NULL\n",
            area,address);
  }

  return faultedP;
}

/*
 * gpfs_filemap_open - vm_ops open handler for a GPFS mapping.
 *
 * Records the inode's address space in the cxiNode (mapSeg) and calls
 * gpfs_ops.gpfsMmap() with its final argument false (gpfs_f_mmap passes
 * true).  Write access is requested only for shared, may-write mappings,
 * and never when the kernel is at or above 2.4.18-3 without
 * MMAP_LINUX_PATCH.
 *
 * The function returns void, so the gpfsMmap result is not propagated.
 * The file's revision log (1.30.2.1) notes that vm_ops->close() is always
 * called after vm_ops->open(), even on error.
 */
void gpfs_filemap_open(struct vm_area_struct * vma)
{
  struct file *file = vma->vm_file;
  struct inode *inode = file->f_dentry->d_inode;
  struct gpfsVfsData_t *vfsDataP;
  cxiNode_t *nodeP;
  ext_cred_t eCred;
  Boolean wantWrite;

  TRACE4(TRACE_VNODE, 2, TRCID_FM_OPEN,
        "gpfs_filemap_open enter: vma %08X inode %d icount %d name %s\n",
        vma,inode->i_ino,atomic_read((atomic_t *)&inode->i_count),
        file->f_dentry? file->f_dentry->d_name.name: (const unsigned char*)"");

  nodeP = VP_TO_CNP(inode);
  vfsDataP = VP_TO_PVP(inode);
  DBGASSERT(vfsDataP != NULL);

  nodeP->mapSeg = inode->i_mapping;

#if !defined(MMAP_LINUX_PATCH) && (LINUX_KERNEL_VERSION >= 2041803)
  /* Patch must be applied at this kernel level for mmap write */
  wantWrite = false;
#else
  if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
    wantWrite = true;
  else
    wantWrite = false;
#endif

  setCred(&eCred);

  /* Result intentionally ignored: this handler has no way to report it. */
  (void) gpfs_ops.gpfsMmap(vfsDataP, nodeP, (void *)inode, &eCred, NULL,
                           wantWrite,false);

  TRACE2(TRACE_VNODE, 2, TRCID_FM_OPEN_EXIT,
        "gpfs_filemap_open exit: vma %08X icount %d\n",
        vma,atomic_read((atomic_t *)&inode->i_count));
}

/*
 * gpfs_filemap_close - vm_ops close handler for a GPFS mapping.
 *
 * Flushes the whole vma range with MmfInvalidate while holding the mmap
 * flush lock, tells GPFS the mapping is gone via gpfs_ops.gpfsUnmap(), and
 * releases the inode reference with cxiPutOSNode().
 *
 * flags passed to gpfsUnmap is 0 for a shared, may-write mapping and
 * CXI_SHM_RDONLY otherwise.  The flag word is now fully initialised on both
 * paths; previously the read-only path OR-ed the flag into an uninitialised
 * automatic variable, handing unpredictable bits to gpfsUnmap.
 *
 * The function returns void, so the gpfsUnmap result is not propagated.
 */
void gpfs_filemap_close(struct vm_area_struct * vma)
{
   struct file *fP = vma->vm_file;
   struct inode *inode = fP->f_dentry->d_inode;
   int flags = 0;
   struct gpfsVfsData_t *privVfsP;
   char buf[] = "gpfs_filemap_close()";   /* caller tag for the flush lock */
   cxiNode_t *cnP = VP_TO_CNP(inode);

   VFS_STAT_START(unmapCall);

   /* Anything that is not a shared, may-write mapping is read-only. */
   if (!((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)))
     flags = CXI_SHM_RDONLY;

   privVfsP = VP_TO_PVP(inode);
   TRACE3(TRACE_VNODE, 2, TRCID_FM_CLOSE_ENTER,
          "gpfs_filemap_close: vma 0x%lX inode 0x%lX i_count %d\n",
          vma, inode, (Int32)atomic_read((atomic_t *)&inode->i_count));
   TRACE3(TRACE_VNODE, 2, TRCID_FM_CLOSE_ENTER1,
          "gpfs_filemap_close: i_ino %d, name %s, nrpages %d\n",
          inode->i_ino,
          fP->f_dentry? fP->f_dentry->d_name.name: (const unsigned char*)"",
          inode->i_data.nrpages);

   /* Flush and invalidate the mapped range under the mmap flush lock. */
   mmapFlushLock(cnP, buf);
   flush_vma(cnP, vma, vma->vm_start, vma->vm_end, MmfInvalidate);
   mmapFlushUnLock(cnP,buf);

   /* Result intentionally ignored: this handler has no way to report it. */
   (void) gpfs_ops.gpfsUnmap(privVfsP, cnP, flags);
   cxiPutOSNode((void *)inode);

   TRACE3(TRACE_VNODE, 2, TRCID_FM_CLOSE,
          "gpfs_filemap_close: vma 0x%lX inode 0x%lX i_count %d\n",
          vma, inode, (Int32)atomic_read((atomic_t *)&inode->i_count));
   VFS_STAT_STOP;
}

/*
 * gpfs_f_mmap - mmap file operation.
 *
 * Records the inode's address space in the cxiNode (mapSeg), registers the
 * mapping with GPFS through gpfs_ops.gpfsMmap() (which takes a hold on the
 * inode), and then runs the generic validation/setup in gpfs_file_mmap().
 * If gpfs_file_mmap() fails after gpfsMmap succeeded, the hold is dropped
 * again with cxiPutOSNode().
 *
 * On kernels at or above 2.4.18-3 without MMAP_LINUX_PATCH, any mapping with
 * VM_MAYWRITE or VM_WRITE set is rejected with -EINVAL.  That path now goes
 * through the common exit so the exit trace and VFS_STAT_STOP are executed,
 * and it returns a negative errno like the other failures produced in
 * gpfs_file_mmap() (previously it returned positive EINVAL directly).
 *
 * Returns 0 on success, non-zero on failure.
 * NOTE(review): a failure code from gpfs_ops.gpfsMmap is returned without
 * negation, whereas gpfs_f_open/gpfs_f_release negate gpfs_ops results -
 * confirm the intended sign convention.
 */
int
gpfs_f_mmap(struct file *fP, struct vm_area_struct *vma)
{
  int rc;
  Boolean heldVnode = false;
  Boolean writeAccess = false;
  cxiNode_t *cnP;
  struct gpfsVfsData_t *privVfsP;
  struct inode *iP = fP->f_dentry->d_inode;
  ext_cred_t eCred;
  cxiVattr_t vattr;

  VFS_STAT_START(map_lloffCall);
  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_MMAP_ENTER,
         "gpfs_f_mmap enter fP 0x%lX inum %d vma 0x%1X\n",
         fP, iP->i_ino, vma);

  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_MMAP_ENTER_A,
         "gpfs_f_mmap vm_start 0x%lX vm_end 0x%lX, vmpgoff 0x%lX, "
         "vmflags 0x%lX\n",
         vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_flags);
  TRACE4(TRACE_VNODE, 2, TRCID_LINUXOPS_MMAP_ENTER_A1,
         "gpfs_f_mmap enter: inode %d icount %d name %s nrpages %d\n",
         iP->i_ino, atomic_read((atomic_t *)&iP->i_count),
         fP->f_dentry ? fP->f_dentry->d_name.name : (const unsigned char*)"",
         iP->i_data.nrpages);

  cnP = VP_TO_CNP(iP);
  privVfsP = VP_TO_PVP(iP);
  DBGASSERT(privVfsP != NULL);

  cnP->mapSeg = iP->i_mapping;
  if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
    writeAccess = true;

#if !defined(MMAP_LINUX_PATCH) && (LINUX_KERNEL_VERSION >= 2041803)
  /* Patch must be applied at this kernel level for mmap write */
  if ((vma->vm_flags & VM_MAYWRITE) || (vma->vm_flags & VM_WRITE))
  {
    rc = -EINVAL;
    goto xerror;   /* heldVnode is still false, so nothing is released */
  }
#endif

  setCred(&eCred);

  rc = gpfs_ops.gpfsMmap(privVfsP, cnP, (void *)iP, &eCred, NULL, 
                         writeAccess,true);
  if (rc == 0)
  {
    heldVnode = true;
    rc = gpfs_file_mmap(fP, vma);
  }

xerror:
  if (rc != 0 && heldVnode)
   cxiPutOSNode((void *)iP); // corresponding hold in gpfsMmap

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_MMAP_EXIT,
         "gpfs_f_mmap exit rc %d\n", rc);
  VFS_STAT_STOP;
  return rc;
}

/*
 * gpfs_f_open - open file operation.
 *
 * Parameters:
 *   iP - inode being opened
 *   fP - file object; fP->private_data receives the MMFSVInfo pointer
 *        produced by gpfsOpen (or by gpfsGetNFS on the NFS path)
 *
 * Flow:
 *   1. If the calling thread holds the big kernel lock it is released for
 *      the duration of the call and re-acquired before returning.
 *   2. If this thread is recorded in the node as the loser of a create race
 *      (createRaceLoserThreadId), the checks that could not be made at
 *      create time are made now: O_CREAT|O_EXCL fails with EEXIST, otherwise
 *      access is verified with gpfsAccess.
 *   3. An NFS thread opening a regular file only obtains the NFS structures
 *      via gpfsGetNFS and skips gpfsOpen.
 *   4. Everything else goes through gpfsOpen.
 *
 * Returns the negated GPFS return code (0 on success).
 *
 * The 'code' value is diagnostic only and appears in the exit trace.  The
 * NFS branch used to declare its own 'code', shadowing this one, so a
 * gpfsGetNFS failure was never visible in the trace; it now sets the
 * function-level variable.
 */
int
gpfs_f_open(struct inode *iP, struct file *fP)
{
  int rc = 0;
  int code = 0;
  Boolean gotBKL = false;
  int flags = cxiOpenFlagsXlate(fP->f_flags);
  cxiNode_t *cnP;
  struct gpfsVfsData_t *privVfsP;
  ext_cred_t eCred;

  VFS_STAT_START(openCall);
  TRACE7(TRACE_VNODE, 1, TRCID_LINUXOPS_OPEN_ENTER,
         "gpfs_f_open enter: iP 0x%lX fP 0x%lX f_flags 0x%X dP 0x%lX '%s' "
         "flags 0x%X isNFS %d\n", iP, fP, fP->f_flags, fP->f_dentry,
         fP->f_dentry? fP->f_dentry->d_name.name: (const unsigned char*)"",
         flags, cxiIsNFSThread());

  /* BKL is not held at entry, except for NFS calls */
  TraceBKL();
  if (current->lock_depth >= 0)  /* kernel lock is held by me */
  {
    gotBKL = true;
    unlock_kernel();
  }

  setCred(&eCred);

  cnP = VP_TO_CNP(iP);
  privVfsP = VP_TO_PVP(iP);
  DBGASSERT(privVfsP != NULL);

  /* see comment in gpfs_i_create() on the reason for this code */
  if (cnP->createRaceLoserThreadId &&
      cnP->createRaceLoserThreadId == cxiGetThreadId())
  {
    int fflags = cxiOpenFlagsXlate(fP->f_flags);
    int amode;

    /* One-shot marker: clear it so later opens by this thread are normal. */
    cnP->createRaceLoserThreadId = 0;
    code = EEXIST;

    /* Truncation needs write permission just like an explicit write open. */
    amode = ((flags & FWRITE ? W_ACC : 0) |
             (flags & FREAD ? R_ACC : 0)  |
             (flags & FTRUNC ? W_ACC : 0));

    TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_OPEN_01,
           "gpfs_f_open fileExist iP 0x%lX cnP 0x%lX fflags 0x%X amode 0x%X\n",
           iP, cnP, fflags, amode);

    /* If both FEXCL and FCREAT are set and the file already exists, return
     * EEXIST.  This could not be done at create time because the open flags
     * are not available on the create call.
     */
    if ((flags & FEXCL) && (flags & FCREAT))
    {
      rc = EEXIST;
      goto xerror;
    }

    rc = gpfs_ops.gpfsAccess(privVfsP, cnP, amode, ACC_SELF, &eCred);
    if (rc)
      goto xerror;
  }

  if (cxiIsNFSThread() && GNP_IS_FILE(cnP))
  {
    int NFSflags;

    BEGIN_FAR_CODE;
    /* Linux NFS will not do vget so the clone vnode cannot be created then.
       Need to GetNFS here so the NFS structures will be available. */

    NFSflags = FWRITE|FREAD;
    rc = gpfs_ops.gpfsGetNFS((void *)iP,
                             (struct MMFSVInfo **)&fP->private_data,
                             &NFSflags);
    if (rc != 0)
    {
      code = ENOSYS; //??EGET_NFS;
      goto xerror;
    }
    DBGASSERT((struct MMFSVInfo *)fP->private_data != NULL);

    END_FAR_CODE;
    /* NFS opens are complete at this point; gpfsOpen is not called. */
    goto xerror;
  }

  rc = gpfs_ops.gpfsOpen(privVfsP, cnP, flags, 0,
                         (struct MMFSVInfo **)&fP->private_data, &eCred);

xerror:
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_OPEN_EXIT,
         "gpfs_f_open exit: iP 0x%lX vinfoP 0x%lX code %d rc %d\n",
         iP, (struct MMFSVInfo *)fP->private_data, code, rc);

  VFS_STAT_STOP;

  if (gotBKL)        /* If held kernel lock on entry then reacquire it */
    lock_kernel();

  return (-rc);
}

/*
 * gpfs_f_release - release (last close) file operation.
 *
 * When an NFS thread releases a file that has NFS state attached to its
 * inode, the close is handed to gpfs_ops.gpfsReleaseNFS(); the existing
 * comments describe this as scheduling a delayed close.  On that path
 * fP->private_data is left untouched.
 *
 * Otherwise the file is closed through gpfs_ops.gpfsClose() and
 * fP->private_data is cleared, since the MMFSVInfo has been freed.
 *
 * Returns the negated GPFS return code (0 on success).
 */
int
gpfs_f_release(struct inode *iP, struct file *fP)
{
  int rc = 0;
  int code = 0;                      /* always 0; reported in the exit trace */
  int openFlags = cxiOpenFlagsXlate(fP->f_flags);
  struct MMFSVInfo *fileInfoP = (struct MMFSVInfo *)fP->private_data;
  cxiNode_t *nodeP;
  struct gpfsVfsData_t *vfsDataP;

  VFS_STAT_START(closeCall);
  nodeP = VP_TO_CNP(iP);
  vfsDataP = VP_TO_PVP(iP);
  DBGASSERT(vfsDataP != NULL);

  DBGASSERT(fileInfoP != NULL);
  TRACE4(TRACE_VNODE, 1, TRCID_LINUXOPS_CLOSE_ENTER,
         "gpfs_f_release enter: iP 0x%X f_flags 0x%X flags 0x%X vinfoP 0x%X\n",
         iP, fP->f_flags, openFlags, fileInfoP);
  /* BKL is held if the file was open R/W, otherwise not held */

  if (nodeP && VP_TO_NFSP(iP) && cxiIsNFSThread())
  {
    /* nfsd is closing one of its files: schedule it for a delayed close.
       On the last NFS release, a watchdog will be set to close the file
       after a delay. */
    DBGASSERT(GNP_IS_FILE(nodeP));
    rc = gpfs_ops.gpfsReleaseNFS(iP);
  }
  else
  {
    rc = gpfs_ops.gpfsClose(vfsDataP, nodeP, openFlags, fileInfoP, true);
    fP->private_data = NULL;  /* MMFSVInfo was freed */
  }

  TRACE2(TRACE_VNODE, 1, TRCID_CLOSE_EXIT,
         "gpfs_f_release exit: code %d rc %d\n", code, rc);

  VFS_STAT_STOP;
  return (-rc);
}

/*
 * gpfs_f_fsync - fsync file operation.
 *
 * Tracing wrapper around fsyncInternal(); direntP and datasync are only
 * traced, not otherwise used.
 *
 * Returns the negated result of fsyncInternal() (0 on success).
 */
int
gpfs_f_fsync(struct file *fP, struct dentry *direntP, int datasync)
{
  int syncRc;

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_FSYNC_ENTER,
         "gpfs_f_fsync enter: fP 0x%lX dirent 0x%lX datasync %d\n",
         fP, direntP, datasync);

  /* Linux doc says BKL is held, but it does not seem to be */
  syncRc = fsyncInternal(fP);

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_FSYNC_EXIT,
         "gpfs_f_fsync exit: file 0x%lX rc %d\n", fP, syncRc);

  return (-syncRc);
}

/*
 * gpfs_f_fasync - fasync file operation.
 *
 * Tracing wrapper around fsyncInternal(); fd and on are only traced.
 * NOTE(review): this handler performs a file sync rather than anything
 * related to asynchronous notification - confirm that is intended.
 *
 * Returns the negated result of fsyncInternal() (0 on success).
 */
int
gpfs_f_fasync(int fd, struct file *fP, int on)
{
  int syncRc;

  TRACE3(TRACE_VNODE, 1, TRCID_LINUXOPS_FASYNC_ENTER,
         "gpfs_f_fasync enter: fd %d fP 0x%lX on %d\n",
         fd, fP, on);

  /* Linux doc says BKL is held, but it does not seem to be */
  syncRc = fsyncInternal(fP);

  TRACE2(TRACE_VNODE, 1, TRCID_LINUXOPS_FASYNC_EXIT,
         "gpfs_f_fasync exit: fP 0x%lX rc %d\n", fP, syncRc);

  return (-syncRc);
}

/*
 * fsyncInternal - common worker for gpfs_f_fsync() and gpfs_f_fasync().
 *
 * Calls gpfs_ops.gpfsFsync() with FFILESYNC for the inode behind fP, using
 * the caller's credentials.  A NULL fP is tolerated (creating files via NFS
 * can get here with one) and is treated as success.
 *
 * Returns the GPFS return code unmodified (0 on success); callers negate it.
 *
 * VFS_STAT_STOP is now executed before returning.  It used to sit after the
 * return statement, where it was unreachable, so the VFS_STAT_START at the
 * top was never paired with a stop.
 */
int
fsyncInternal(struct file *fP)
{
  int rc = 0;
  cxiNode_t *cnP;
  struct inode *iP;
  struct gpfsVfsData_t *privVfsP;
  ext_cred_t eCred;

  VFS_STAT_START(fsyncCall);
  /* Creating files via nfs can get us here with a null fP. */
  if (!fP)
    goto xerror;

  iP = fP->f_dentry->d_inode;
  DBGASSERT(iP != NULL);

  cnP = VP_TO_CNP(iP);
  privVfsP = VP_TO_PVP(iP);
  DBGASSERT(privVfsP != NULL);

  setCred(&eCred);
  rc = gpfs_ops.gpfsFsync(privVfsP, cnP, FFILESYNC, &eCred);

xerror:
  VFS_STAT_STOP;
  return rc;
}

/*
 * gpfs_f_check_media_change - media-change check; nothing to do.
 *
 * Traces the device and always returns 0.
 */
int
gpfs_f_check_media_change(kdev_t dev)
{
  const int unchanged = 0;

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_MEDIA,
         "gpfs_f_check_media_change enter & exir rc 0: dev 0x%X\n", dev);

  return unchanged;
}

/*
 * gpfs_f_revalidate - device revalidation; nothing to do.
 *
 * Traces the device and always returns 0.
 */
int
gpfs_f_revalidate(kdev_t dev)
{
  const int ok = 0;

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_REVALIDATE_DEV,
         "gpfs_f_revalidate enter & exir rc 0: dev 0x%X\n", dev);

  return ok;
}

/*
 * gpfs_f_lock - lock file operation (fcntl byte-range locks).
 *
 * Parameters:
 *   fP  - file the lock applies to
 *   cmd - F_GETLK, F_SETLK or F_SETLKW.  On non-64-bit builds the *64
 *         variants are mapped to these.  With SMB_LOCKS defined, a value
 *         above 0x7FFF additionally encodes a lock-move request (see below).
 *   flP - kernel file_lock describing the range and type
 *
 * The file_lock is converted to an eflock_t with cxiVFSToFlock(), l_whence
 * is forced to SEEK_SET, and the request is passed to gpfs_ops.gpfsFcntl()
 * with the inode as the advisory lock object.
 *
 * Returns the negated GPFS return code; -ENOSYS for any other cmd.
 */
int
gpfs_f_lock(struct file *fP, int cmd, struct file_lock *flP)
{
  int rc = 0;
  int code = 0;  /* set on the ENOSYS path only; not traced or returned */
  cxiNode_t *cnP;
  ext_cred_t eCred;
  struct gpfsVfsData_t *privVfsP;
  eflock_t lckdat;
  unsigned long localRetryId = 0;
  /* iP is not referenced below; the body uses fP->f_dentry->d_inode. */
  struct inode *iP = fP->f_dentry->d_inode;
  struct MMFSVInfo *vinfoP = (struct MMFSVInfo *)fP->private_data;
#ifdef SMB_LOCKS
  int nodenumber = 0;
  Boolean movelock = false;
#endif

  VFS_STAT_START(lockctlCall);

#ifdef SMB_LOCKS
  /* The interface for moving locks is to provide the origin node number  *
   * as the high order 16 bits of the cmd value.  In addition, the 0x8000 *
   * bit should be on in case the node number is zero.  This interface is *
   * a unique function provided to allow lock movement for NFS within the *
   * Stingray / Squid NAS products.                                       */
  if (cmd > 0x7FFF)
  {
    /* Split the node number off so the plain command can be validated;
     * both parts are recombined before the gpfsFcntl call. */
    nodenumber = (cmd & 0xFFFF0000)>>16;
    movelock = true;
    cmd = cmd & 0x7fff;
  }
#endif

  /* Linux converts flock64 to flock before calling GPFS lock routine,
     but leaves "cmd" as is. Allow these to go through. */
#if !defined(__64BIT__)
  if (cmd == F_GETLK64) cmd = F_GETLK;
  if (cmd == F_SETLK64) cmd = F_SETLK;
  if (cmd == F_SETLKW64) cmd = F_SETLKW;
#endif

  /* Only the three basic lock commands are supported. */
  if ((cmd != F_GETLK) && (cmd != F_SETLK) && (cmd != F_SETLKW))
  {
    code = 2;
    rc = ENOSYS;
    goto xerror;
  }

  setCred(&eCred);
  TRACE6(TRACE_VNODE, 1, TRCID_LINUXOPS_LOCKCTL_ENTER,
         "gpfs_f_lock enter: pid %d fp 0x%X range 0x%lX:%lX cmd %s type %s\n",
         flP->fl_pid, fP, flP->fl_start, flP->fl_end,
         (cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW",
         (flP->fl_type == F_RDLCK) ? "RDLCK" :
         (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");

  TRACE5(TRACE_VNODE, 3, TRCID_LINUXOPS_LOCKCTL_ENTER2,
         "gpfs_f_lock       : pos 0x%lX iP 0x%X fl_flags 0x%X uid %d gid %d\n",
         fP->f_pos, fP->f_dentry->d_inode, flP->fl_flags,
         eCred.principal, eCred.group);
  TraceBKL();
#ifdef SMB_LOCKS
  if (movelock)
    TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_LOCKCTL_ENTER3,
          "gpfs_f_lock  enter: Move lock from node %d", nodenumber);
#endif

  cnP = VP_TO_CNP(fP->f_dentry->d_inode);
  privVfsP = VP_TO_PVP(fP->f_dentry->d_inode);
  DBGASSERT(privVfsP != NULL);
  DBGASSERT(vinfoP != NULL);

  /* convert file_lock to eflock */
  cxiVFSToFlock((void *)flP, &lckdat);

  lckdat.l_whence = SEEK_SET;
#ifdef SMB_LOCKS
  /* Re-encode the move request: 0x8000 marker plus node number in the
   * high-order 16 bits, as described at the top of the function. */
  if (movelock)
    cmd = cmd | 0x8000 | (nodenumber << 16);
#endif

  rc = gpfs_ops.gpfsFcntl(NULL,    // KernelOperation initialized in gpfsFcntl
                          privVfsP,
                          NULL,     // struct vnode *vP or NULL
                          fP->f_dentry->d_inode,  // advObjP (advisory lock object) is inode
                          flP,      // struct file_lock
                          cnP,
                          0,        // offset
                          &lckdat,  // struct cxiFlock_t
                          cmd,
                          NULL,     // application retry_fcn not used
                          &localRetryId,
                          &eCred,
                          false);   // NOT token acquire only

xerror:

  TRACE2(TRACE_VNODE, 11, TRCID_LINUXOPS_LOCKCTL_DIAG2,
         "gpfs_f_lock: fP 0x%X, f_dentry 0x%X",
         fP, fP->f_dentry);

  VFS_STAT_STOP;

  TRACE1(TRACE_VNODE, 3, TRCID_LINUXOPS_LOCKCTL_EXIT,
         "gpfs_f_lock exit: rc %d",
         rc);

  return (-rc);
}

/*
 * rdwrInternal - common worker for the read/write/readv/writev entry points.
 *
 * Parameters:
 *   fP      - open file; fP->private_data holds the MMFSVInfo
 *   op      - CXI_READ selects gpfsRead, anything else selects gpfsWrite
 *   iovecP  - array of I/O segments
 *   count   - number of elements in iovecP
 *   offsetP - in: starting file offset; out: offset after the transfer
 *             (updated only on success)
 *
 * Builds a cxiUio_t describing the whole request and passes it to GPFS.  If
 * the calling thread holds the big kernel lock, it is released for the
 * duration of the call and re-acquired before returning.
 *
 * Returns the number of bytes transferred (total requested minus the
 * residual count), or the negated GPFS return code on failure.
 */
int
rdwrInternal(struct file *fP, cxiRdWr_t op, const struct cxiIovec_t *iovecP,
             unsigned long count, loff_t *offsetP)
{
  int i, rc;  /* note: i is int while count is unsigned long */
  int code = 0;  /* not referenced below */
  Boolean gotBKL = false;
  size_t total_len = 0;
  struct cxiUio_t tmp_uio;
  /* fP is dereferenced by these initializers before the DBGASSERT below. */
  int flags = cxiOpenFlagsXlate(fP->f_flags);
  struct gpfsVfsData_t *privVfsP;
  cxiNode_t *cnP;
  struct MMFSVInfo *vinfoP = (struct MMFSVInfo *)fP->private_data;
  struct inode *iP;
  ext_cred_t eCred;

  VFS_STAT_START((op == CXI_READ)? readCall: writeCall);
  DBGASSERT(fP != NULL);
  iP = fP->f_dentry->d_inode;
  DBGASSERT(iP != NULL);

  TRACE11(TRACE_VNODE, 1, TRCID_LINUXOPS_RDWRINT_ENTER,
          "gpfs_f_rdwr enter: fP 0x%lX f_flags 0x%X flags 0x%X op %d "
          "iovec 0x%lX count %d offset 0x%llX "
          "dentry 0x%lX private 0x%lX iP 0x%lX name '%s'\n",
          fP, fP->f_flags, flags, op, iovecP, count, *offsetP, fP->f_dentry,
          fP->private_data, fP->f_dentry->d_inode, fP->f_dentry->d_name.name);

  /* BKL is not held at entry, except for NFS calls */
  TraceBKL();
  if (current->lock_depth >= 0)  /* kernel lock is held by me */
  {
    gotBKL = true;
    unlock_kernel();
  }

  privVfsP = VP_TO_PVP(iP);
  DBGASSERT(privVfsP != NULL);
  cnP = VP_TO_CNP(iP);

  /* The const qualifier is cast away because uio_iov is a non-const pointer. */
  tmp_uio.uio_iov = (struct cxiIovec_t *)iovecP; /* ptr to iovec struct array */
  tmp_uio.uio_iovcnt = count;    /* #iovec elements remaining to be processed*/
  tmp_uio.uio_iovdcnt = 0;       /* #iovec elements already processed   */
  tmp_uio.uio_offset = *offsetP; /* byte offset in file/dev to read/write*/
  tmp_uio.uio_segflg = 0;        /* see segment flag value */
  tmp_uio.uio_fmode = 0;         /* copy of file modes from open file struct */

  /* Total request size is the sum of all segment lengths. */
  for (i = 0; i < count; i++)
    total_len += iovecP[i].iov_len;

  tmp_uio.uio_resid = total_len; /* #bytes left in data area */

  DBGASSERT(vinfoP != NULL);

  setCred(&eCred);
  if (op == CXI_READ)
    rc = gpfs_ops.gpfsRead(privVfsP, cnP, flags, &tmp_uio,
                           vinfoP, NULL, &eCred, false);
  else
  {
    rc = gpfs_ops.gpfsWrite(privVfsP, cnP, flags, &tmp_uio,
                            vinfoP, NULL, &eCred, false);
#if LINUX_KERNEL_VERSION >= 2040900
    /* Mark the superblock dirty after every write attempt, whatever rc is. */
    iP->i_sb->s_dirt = 1;
#endif
  }

  TRACE5(TRACE_VNODE, 1, TRCID_LINUXOPS_RDWRINT_EXIT,
         "gpfs_f_rdwr exit: fP 0x%lX total_len %d uio_resid %d "
         "offset 0x%llX rc %d\n", fP, total_len, tmp_uio.uio_resid,
         tmp_uio.uio_offset, rc);

/* No goto in this function targets this label. */
xerror:
  VFS_STAT_STOP;

  if (gotBKL)        /* If held kernel lock on entry then reacquire it */
    lock_kernel();

  /* On failure the caller's offset is left untouched. */
  if (rc)
    return (-rc);

  *offsetP = tmp_uio.uio_offset;
  /* Bytes transferred; the size_t difference is narrowed to int here. */
  return (total_len - tmp_uio.uio_resid);
}

/*
 * gpfs_f_read - read file operation.
 *
 * Wraps the caller's buffer in a single-element iovec and hands it to
 * rdwrInternal() with CXI_READ.
 *
 * Returns the int result of rdwrInternal(): bytes read, or a negated error
 * code.
 */
ssize_t
gpfs_f_read(struct file *fP, char *bufP, size_t count,
            loff_t *offsetP)
{
  cxiIovec_t oneSeg;

  oneSeg.iov_len = count;     /* length of transfer for this area */
  oneSeg.iov_base = bufP;     /* base memory address              */

  return rdwrInternal(fP, CXI_READ, &oneSeg, 1, offsetP);
}

/*
 * gpfs_f_dir_read - read file operation for directories.
 *
 * Traces the call and always fails with -EISDIR; bufP, count and offsetP
 * are not examined.
 */
ssize_t
gpfs_f_dir_read(struct file *fP, char *bufP, size_t count,
                loff_t *offsetP)
{
  const ssize_t isDirectory = -EISDIR;

  TRACE1(TRACE_VNODE, 1, TRCID_LINUXOPS_READ_DIR,
         "gpfs_f_dir_read: fP 0x%lX EISDIR\n", fP);

  return isDirectory;
}

/*
 * gpfs_f_write - write file operation.
 *
 * Wraps the caller's buffer in a single-element iovec and hands it to
 * rdwrInternal() with CXI_WRITE.  The const qualifier on bufP is cast away
 * because iov_base is assigned from a non-const char pointer.
 *
 * Returns the int result of rdwrInternal(): bytes written, or a negated
 * error code.
 */
ssize_t
gpfs_f_write(struct file *fP, const char *bufP, size_t count,
             loff_t *offsetP)
{
  cxiIovec_t oneSeg;

  oneSeg.iov_len = count;            /* length of transfer for this area */
  oneSeg.iov_base = (char *)bufP;    /* base memory address              */

  return rdwrInternal(fP, CXI_WRITE, &oneSeg, 1, offsetP);
}

/*
 * gpfs_f_readv - readv file operation.
 *
 * Passes the whole iovec array (count elements) to rdwrInternal() with
 * CXI_READ.  The array is reinterpreted as cxiIovec_t by a pointer cast.
 *
 * Returns bytes read, or a negated error code.
 */
ssize_t
gpfs_f_readv(struct file *fP, const struct iovec *iovecP,
             unsigned long count, loff_t *offsetP)
{
  const struct cxiIovec_t *segsP = (const struct cxiIovec_t *)iovecP;

  return rdwrInternal(fP, CXI_READ, segsP, count, offsetP);
}

/*
 * gpfs_f_writev - writev file operation.
 *
 * Passes the whole iovec array (count elements) to rdwrInternal() with
 * CXI_WRITE.  The array is reinterpreted as cxiIovec_t by a pointer cast.
 *
 * Returns bytes written, or a negated error code.  Note the return type is
 * int here, whereas gpfs_f_readv returns ssize_t.
 */
int
gpfs_f_writev(struct file *fP, const struct iovec *iovecP,
              unsigned long count, loff_t *offsetP)
{
  const struct cxiIovec_t *segsP = (const struct cxiIovec_t *)iovecP;

  return rdwrInternal(fP, CXI_WRITE, segsP, count, offsetP);
}

extern int Cleanup_fd;

/*
 * gpfs_f_cleanup - runs when the last mmfsd process terminates, to do some
 * basic cleanup so that the daemon can be restarted nicely.
 *
 * If Cleanup_fd is non-zero, gpfs_ops.gpfsCleanup() is called and Cleanup_fd
 * is reset to 0, so repeated calls do the cleanup only once.  iP and fP are
 * not used.
 *
 * Returns the result of gpfsCleanup() unmodified, or 0 when there was
 * nothing to do.
 */
int gpfs_f_cleanup(struct inode *iP, struct file *fP)
{
  int cleanupRc;

  if (!Cleanup_fd)
    return 0;

  cleanupRc = gpfs_ops.gpfsCleanup();
  Cleanup_fd = 0;

  return cleanupRc;
}

